Take Home Exercise 3

Author

Kieren Chua

Published

November 1, 2024

Modified

Invalid Date

Part 1 : Data And Packages

Packages

pacman::p_load(sf, spdep, GWmodel, SpatialML, 
               tmap, rsample, Metrics, tidyverse,
               knitr, kableExtra, jsonlite)

Data

We can get the lat long data from the previous output

location_data <- read_rds("data/processed_data/coords.rds")

resale_data <- read_csv("data/resale.csv")
Rows: 192970 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): month, town, flat_type, block, street_name, storey_range, flat_mode...
dbl (3): floor_area_sqm, lease_commence_date, resale_price

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
resale_tidy <- resale_data %>%
  mutate(address = paste(block,street_name)) %>%
  mutate(remaining_lease_yr = as.integer(
    str_sub(remaining_lease, 0, 2)))%>%
  mutate(remaining_lease_mth = as.integer(
    str_sub(remaining_lease, 9, 11)))

We also got data from the other required decision parameters, which are :

Structural factors

  1. Area of the unit
  2. Floor level Remaining
  3. lease
  4. Age of the unit

Locational factors

  1. Proxomity to CBD
  2. Proximity to eldercare
  3. Proximity to foodcourt/hawker centres
  4. Proximity to MRT
  5. Proximity to park
  6. Proximity to good primary school ( All schools are good schools lol)
  7. Proximity to shopping mall
  8. Proximity to supermarket
  9. Numbers of kindergartens within 350m
  10. Numbers of childcare centres within 350m
  11. Numbers of bus stop within 350m
  12. Numbers of primary school within 1km
CBD_lat_long <- c(1.287953, 103.851784) # Taken from https://www.latlong.net/place/downtown-core-singapore-20616.html

CBD_svy21 <- st_sfc(st_point(c(103.851784, 1.287953)), 
                    crs = 4326) %>%
                    st_transform(3414)
eldercare_data <- st_read(dsn = "data/EldercareServicesSHP", 
                          layer = "ELDERCARE") %>% st_transform(3414)
Reading layer `ELDERCARE' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\EldercareServicesSHP' 
  using driver `ESRI Shapefile'
Simple feature collection with 133 features and 18 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 14481.92 ymin: 28218.43 xmax: 41665.14 ymax: 46804.9
Projected CRS: SVY21
foodcourt_data <- st_read("data/HawkerCentresGEOJSON.geojson") %>% st_transform(3414)
Reading layer `HawkerCentresGEOJSON' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\HawkerCentresGEOJSON.geojson' 
  using driver `GeoJSON'
Simple feature collection with 125 features and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: 103.6974 ymin: 1.272716 xmax: 103.9882 ymax: 1.449017
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
MRT_data <- st_read("data/LTAMRTStationExitGEOJSON.geojson") %>% st_transform(3414)
Reading layer `LTAMRTStationExitGEOJSON' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\LTAMRTStationExitGEOJSON.geojson' 
  using driver `GeoJSON'
Simple feature collection with 563 features and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: 103.6368 ymin: 1.264972 xmax: 103.9893 ymax: 1.449157
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
park_data <- st_read("data/Parks.kml") %>% st_transform(3414)
Reading layer `NATIONALPARKS' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\Parks.kml' 
  using driver `KML'
Simple feature collection with 430 features and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: 103.6929 ymin: 1.214491 xmax: 104.0538 ymax: 1.462094
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
primarySchool_data <- st_read("data/LTASchoolZone.geojson") %>% st_transform(3414)
Reading layer `LTASchoolZone' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\LTASchoolZone.geojson' 
  using driver `GeoJSON'
Simple feature collection with 211 features and 2 fields
Geometry type: MULTIPOLYGON
Dimension:     XY, XYZ
Bounding box:  xmin: 103.687 ymin: 1.272736 xmax: 103.9668 ymax: 1.457587
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
Warning in CPL_transform(x, crs, aoi, pipeline, reverse, desired_accuracy, :
GDAL Message 1: Sub-geometry 0 has coordinate dimension 2, but container has 3
Warning in CPL_transform(x, crs, aoi, pipeline, reverse, desired_accuracy, :
GDAL Message 1: Sub-geometry 1 has coordinate dimension 2, but container has 3
mall_data <- st_read(dsn = "data/MP14SDCPPWPLANMallandPromenadeSHP", 
                     layer="G_MP14_PKWB_MALL_PROM_PL") %>% st_transform(3414)
Reading layer `G_MP14_PKWB_MALL_PROM_PL' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\MP14SDCPPWPLANMallandPromenadeSHP' 
  using driver `ESRI Shapefile'
Simple feature collection with 464 features and 8 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 15576.2 ymin: 24936 xmax: 40537.72 ymax: 48239.39
Projected CRS: SVY21
supermarket_data <- st_read("data/SupermarketsGEOJSON.geojson") %>% st_transform(3414)
Reading layer `SupermarketsGEOJSON' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\SupermarketsGEOJSON.geojson' 
  using driver `GeoJSON'
Simple feature collection with 526 features and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: 103.6258 ymin: 1.24715 xmax: 104.0036 ymax: 1.461526
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
kindergarten_data <- st_read("data/Kindergartens.geojson") %>% st_transform(3414)
Reading layer `Kindergartens' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\Kindergartens.geojson' 
  using driver `GeoJSON'
Simple feature collection with 448 features and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: 103.6887 ymin: 1.247759 xmax: 103.9717 ymax: 1.455452
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
childcare_data <- st_read("data/ChildCareServices.geojson") %>% st_transform(3414)
Reading layer `ChildCareServices' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\ChildCareServices.geojson' 
  using driver `GeoJSON'
Simple feature collection with 1925 features and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: 103.6878 ymin: 1.247759 xmax: 103.9897 ymax: 1.462134
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
busstop_data <- st_read(dsn = "data/BusStopLocation_Jul2024",
                        layer= "BusStop") %>% st_transform(3414)
Reading layer `BusStop' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\BusStopLocation_Jul2024' 
  using driver `ESRI Shapefile'
Simple feature collection with 5166 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 3970.122 ymin: 26482.1 xmax: 48285.52 ymax: 52983.82
Projected CRS: SVY21

Part 2 : Processing the data

Now we need to add all the data to the dataframe. We will also jitter the geomoetries of dependent variables just in case they overlap.

Locations of HDB

We can use the location data and join by postal code

resale_tidy_loc <- left_join(resale_tidy, location_data, by = "address")
resale_tidy_clean <- resale_tidy_loc %>% 
                filter(!is.na(postal))

# Then we convert the lat long into SVY21
resale_sf <- st_as_sf(resale_tidy_clean, 
                      coords = c("longitude", "latitude"),
                      crs = 4326) %>%
            st_transform(3414)

# Handle the NA in the lease months
resale_sf$remaining_lease_mth[is.na(resale_sf$remaining_lease_mth)] <- 0

We also need to jitter the points so that the points do not share the same coordinates, we need to jitter quite abit for the regression to work latter.

resale_sf$geometry <- st_jitter(resale_sf$geometry, amount = 2)

We can now show the data

tmap_mode("plot")
tmap mode set to plotting
tm_shape(resale_sf) + 
  tm_dots()

Unit Age

resale_sf$unit_age <- 99 - resale_sf$remaining_lease_yr

Proximity to CBD

We can compare all locations to the single CBD coordinate and output a distance

distance_matrix <- st_distance(resale_sf$geometry, CBD_svy21)

resale_sf$PROX_CBD <- apply(distance_matrix, 1, min)

Proximity to Eldercare

In this case we can find the closest elder-care to the HDB unit

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(eldercare_data$geometry, amount = 2))

resale_sf$PROX_ELDER <- apply(distance_matrix, 1, min)

Proximity to Hawker Center

We do the same thing here

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(st_zm(foodcourt_data$geometry), amount = 2))

resale_sf$PROX_HAWKER <- apply(distance_matrix, 1, min)

Proximity to MRT

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(st_zm(MRT_data$geometry), amount = 2))

resale_sf$PROX_MRT <- apply(distance_matrix, 1, min)

Proximity to Park

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(st_zm(park_data$geometry), amount = 2))

resale_sf$PROX_PARK <- apply(distance_matrix, 1, min)

Proximity to Primary School

We need to get the center of the primary school to compare against centroid. Need to drop the z value

primarySchool_data$geometry <- st_zm(primarySchool_data$geometry)
primarySchool_data$centroid <- st_centroid(primarySchool_data$geometry)
distance_matrix <- st_distance(resale_sf$geometry, st_jitter(primarySchool_data$centroid, amount = 2))

resale_sf$PROX_PRIM <- apply(distance_matrix, 1, min)

Proximity to Shopping Mall

Same for the shopping mall

mall_data$centroid <- st_centroid(mall_data$geometry)
distance_matrix <- st_distance(resale_sf$geometry, st_jitter(mall_data$centroid, amount = 2))

resale_sf$PROX_MALL <- apply(distance_matrix, 1, min)

Proximity to Supermarket

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(st_zm(supermarket_data$geometry), amount = 2))

resale_sf$PROX_SPMK <- apply(distance_matrix, 1, min)

Number of Kindergartens within 350m

To calculate the number of kindergartens within 350m of the HBD, we need to have a 350m search radius around each location, then count the number of kindergartens within

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(st_zm(kindergarten_data$geometry), amount = 2))

count_within_350m <- apply(distance_matrix, 1, function(distances) {
  sum(distances <= 350)  # Count points within 350 meters
})

resale_sf$KIND_350 <- count_within_350m

Number of Childcares within 350m

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(st_zm(childcare_data$geometry), amount = 2))

count_within_350m <- apply(distance_matrix, 1, function(distances) {
  sum(distances <= 350)  # Count points within 350 meters
})

resale_sf$CHILD_350 <- count_within_350m

Number of Bus-Stops within 350m

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(busstop_data$geometry, amount = 2))

count_within_350m <- apply(distance_matrix, 1, function(distances) {
  sum(distances <= 350)  # Count points within 350 meters
})

resale_sf$BUS_350 <- count_within_350m

Number of Primary School within 1000m

distance_matrix <- st_distance(resale_sf$geometry, st_jitter(primarySchool_data$centroid, amount = 2))

count_within_1km <- apply(distance_matrix, 1, function(distances) {
  sum(distances <= 1000)  # Count points within 350 meters
})

resale_sf$PRI_1K <- count_within_1km

Saving the data

Now we can save the data for future purposes.

write_rds(resale_sf, "data/resale_sf_processed.rds")

Part 3 : Shrinking the search space

Read the data

cleaned_resale_sf <- read_rds("data/resale_sf_processed.rds")
cleaned_resale_no_geom <- cleaned_resale_sf %>% st_drop_geometry()

Because there is too much data, we will need to reduce the size of inspection. First we shall determine the types of flats available.

unique_flat_types <- unique(cleaned_resale_sf$flat_type)
unique_flat_types
[1] "3 ROOM"    "4 ROOM"    "5 ROOM"    "EXECUTIVE" "2 ROOM"   

We also want to see the types of flats that are available

unique_flat_models <- unique(cleaned_resale_sf$flat_model)
unique_flat_models
 [1] "New Generation"         "DBSS"                   "Improved"              
 [4] "Apartment"              "Simplified"             "Model A"               
 [7] "Model A-Maisonette"     "Maisonette"             "Standard"              
[10] "Premium Apartment"      "Type S1"                "Model A2"              
[13] "Type S2"                "Adjoined flat"          "Premium Apartment Loft"
[16] "2-room"                 "Premium Maisonette"     "3Gen"                  

To have more focus on the data, we shall focus on the more expensive flat models vs the exercise given to us. To get an idea of what is expensive, we will need to see the spread of flat prices by their specific prices. We can get the mean price of each house type

mean_prices <- cleaned_resale_sf %>%
  group_by(flat_model) %>%
  summarise(mean_price = mean(resale_price, na.rm = TRUE))
print(mean_prices)
Simple feature collection with 18 features and 2 fields
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: 11596.29 ymin: 28155.59 xmax: 42443.22 ymax: 48684.82
Projected CRS: SVY21 / Singapore TM
# A tibble: 18 × 3
   flat_model             mean_price                                    geometry
   <chr>                       <dbl>                              <GEOMETRY [m]>
 1 2-room                    344621. MULTIPOINT ((14055.98 35918.41), (15368.29…
 2 3Gen                      733630. MULTIPOINT ((14419.71 36874.42), (14420.79…
 3 Adjoined flat             738722. MULTIPOINT ((14362.2 36503.4), (14744.58 3…
 4 Apartment                 703122. MULTIPOINT ((11635.28 36023.49), (11635.71…
 5 DBSS                      744929. MULTIPOINT ((15809.26 34519.1), (15809.32 …
 6 Improved                  510734. MULTIPOINT ((11754.27 35787.75), (11755.02…
 7 Maisonette                749470. MULTIPOINT ((11635.58 36022.97), (11635.61…
 8 Model A                   526007. MULTIPOINT ((11596.29 35823.18), (11596.43…
 9 Model A-Maisonette        721084. MULTIPOINT ((15672.38 37430.78), (15672.54…
10 Model A2                  406797. MULTIPOINT ((12884.38 35299.85), (12884.53…
11 New Generation            370598. MULTIPOINT ((14868.62 36798.73), (14868.83…
12 Premium Apartment         570944. MULTIPOINT ((12548.43 35413.45), (12548.45…
13 Premium Apartment Loft    997578. MULTIPOINT ((35118.51 42850.13), (35118.54…
14 Premium Maisonette        808000                    POINT (41889.84 38258.25)
15 Simplified                384347. MULTIPOINT ((18337.97 38162.39), (18338.1 …
16 Standard                  352766. MULTIPOINT ((14904.17 36251.67), (14905.91…
17 Type S1                  1016522. MULTIPOINT ((28849.27 28921.27), (28849.28…
18 Type S2                  1130472. MULTIPOINT ((28849.28 28921.19), (28849.35…

We can also see the mean prices by flat type

mean_prices <- cleaned_resale_sf %>%
  group_by(flat_type) %>%
  summarise(mean_price = mean(resale_price, na.rm = TRUE))
print(mean_prices)
Simple feature collection with 5 features and 2 fields
Geometry type: MULTIPOINT
Dimension:     XY
Bounding box:  xmin: 11596.29 ymin: 28155.59 xmax: 42443.22 ymax: 48684.82
Projected CRS: SVY21 / Singapore TM
# A tibble: 5 × 3
  flat_type mean_price                                                  geometry
  <chr>          <dbl>                                          <MULTIPOINT [m]>
1 2 ROOM       297474. ((14054.47 35918.34), (14054.52 35916.76), (14055.98 359…
2 3 ROOM       369544. ((11596.29 35823.18), (11596.43 35823.58), (11596.44 358…
3 4 ROOM       548271. ((11596.57 35822.46), (11597.06 35822.48), (11597.47 358…
4 5 ROOM       630283. ((11754.27 35787.75), (11755.02 35789.95), (11755.11 357…
5 EXECUTIVE    709138. ((11635.28 36023.49), (11635.58 36022.97), (11635.61 360…

Since the class exercise did the model for 2 and 3 room flat, we shall try to do a housing market that is higher in value which is the executive flats.

cleaned_resale_sf_cut <- cleaned_resale_sf %>% filter(flat_type %in% c('EXECUTIVE'))

cleaned_resale_sf_cut_no_geom <- cleaned_resale_sf_cut %>%
                                  st_drop_geometry()

This leaves us with over 1000 data samples, which should be significant enough for us.

print(nrow(cleaned_resale_sf_cut))
[1] 2294

Part 4 : Computing Correlation Matrix

Bin the data

We need to bin some of the variables so that they make integers

Storeys

unqiue_storey <- unique(cleaned_resale_sf_cut_no_geom$storey_range)
unqiue_storey
[1] "07 TO 09" "01 TO 03" "13 TO 15" "10 TO 12" "04 TO 06" "16 TO 18" "19 TO 21"
[8] "22 TO 24"

We need to map it by height, where a low value corresponds to a low height

# We need to arrange the the mapping by height
calculate_height <- function(range) {
  # Split the range to get the lower and upper bounds
  bounds <- as.numeric(unlist(strsplit(range, " TO ")))

  avg_height <- mean(bounds)
  return(avg_height)
}

heights <- sapply(unqiue_storey, calculate_height)

# Make the mapping
height_mapping <- setNames(sapply(unqiue_storey, calculate_height), unqiue_storey)

We then apply this mapping to the dataframe

cleaned_resale_sf_cut_no_geom$storey_range_bin <- height_mapping[cleaned_resale_sf_cut_no_geom$storey_range]
cleaned_resale_sf_cut$storey_range_bin <- height_mapping[cleaned_resale_sf_cut$storey_range]

Plotting the graph

We are not sure if all the variables are correlated, so we can build a correlation matrix to see if we need to exclude any variables

required_cols <- c(7, 9, 13, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)
corrplot::corrplot(cor(cleaned_resale_sf_cut_no_geom[, required_cols]), 
                   diag = FALSE, 
                   order = "AOE",
                   tl.pos = "td", 
                   tl.cex = 0.5, 
                   method = "number", 
                   type = "upper")

We can see that lease commence date and remain lease yr have high correlation, so we remove lease commence date.

Variance Inflation Factor

We also can check the Variance Inflation Factor to see if there are any variables above a 5

Train-Test Split

First we need to do a train test split for any model training. Split shall be 65 - 35.

set.seed(1234)
# First we try to remove any NA values
cleaned_resale_sf_cut <- cleaned_resale_sf_cut[rowSums(is.na(st_drop_geometry(cleaned_resale_sf_cut))) == 0,, ]

resale_split <- initial_split(cleaned_resale_sf_cut, 
                              prop = 6.5/10,)

train_data <- training(resale_split)

We need to check for overlaps in the train data to see if there is data we need to remove.

overlap_matrix <- st_equals(train_data$geometry)
overlap_matrix <- sapply(overlap_matrix, function(x) length(x) > 1)
any_overlap <- any(overlap_matrix)
any_overlap
[1] FALSE

Generating simple LM Model

price_mlr <- lm(resale_price ~ floor_area_sqm + storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL +
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K,
                data=train_data)
vif <- performance::check_collinearity(price_mlr)
kable(vif, 
      caption = "Variance Inflation Factor (VIF) Results") %>%
  kable_styling(font_size = 18) 
Variance Inflation Factor (VIF) Results
Term VIF VIF_CI_low VIF_CI_high SE_factor Tolerance Tolerance_CI_low Tolerance_CI_high
floor_area_sqm 1.623501 1.520385 1.747050 1.274167 0.6159527 0.5723934 0.6577280
storey_range_bin 1.070079 1.030984 1.158501 1.034446 0.9345105 0.8631842 0.9699468
remaining_lease_yr 2.058889 1.913492 2.227427 1.434883 0.4856990 0.4489484 0.5226048
PROX_CBD 1.497694 1.407022 1.608565 1.223803 0.6676931 0.6216720 0.7107210
PROX_ELDER 1.439628 1.354780 1.544768 1.199845 0.6946237 0.6473462 0.7381270
PROX_HAWKER 1.156997 1.103171 1.238907 1.075638 0.8643062 0.8071632 0.9064781
PROX_MRT 1.325093 1.252013 1.419366 1.151127 0.7546640 0.7045401 0.7987139
PROX_PARK 1.367000 1.289557 1.465157 1.169188 0.7315287 0.6825208 0.7754603
PROX_MALL 1.188684 1.130829 1.272125 1.090268 0.8412662 0.7860862 0.8843071
PROX_SPMK 1.326603 1.253364 1.421013 1.151782 0.7538050 0.7037233 0.7978528
KIND_350 1.262790 1.196398 1.351627 1.123739 0.7918972 0.7398492 0.8358425
CHILD_350 1.460526 1.373574 1.567717 1.208522 0.6846846 0.6378701 0.7280276
BUS_350 1.285403 1.216548 1.376152 1.133756 0.7779659 0.7266638 0.8219978
PRI_1K 1.509446 1.417603 1.621488 1.228595 0.6624948 0.6167174 0.7054164

We can also plot this out for better visualization

plot(vif) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Variable `Component` is not in your data frame :/

Part 5 : Generating Geographically Weighted Predictive Models

Convert to Spatial Datframe

# First we check for NA values in the traindata

train_data_sp <- as_Spatial(train_data)

Get adaptive bandwidth

bw_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
                  storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K,
                  data=train_data_sp,
                  approach="CV",
                  kernel="gaussian",
                  adaptive=TRUE,
                  longlat=FALSE)
Adaptive bandwidth: 929 CV score: 1.805548e+13 
Adaptive bandwidth: 582 CV score: 1.718301e+13 
Adaptive bandwidth: 367 CV score: 1.587955e+13 
Adaptive bandwidth: 234 CV score: 1.459566e+13 
Adaptive bandwidth: 152 CV score: 1.286563e+13 
Adaptive bandwidth: 101 CV score: 1.098861e+13 
Adaptive bandwidth: 70 CV score: 9.428636e+12 
Adaptive bandwidth: 50 CV score: 8.560562e+12 
Adaptive bandwidth: 38 CV score: 6.147621e+18 
Adaptive bandwidth: 57 CV score: 8.725395e+12 
Adaptive bandwidth: 45 CV score: 8.371286e+12 
Adaptive bandwidth: 42 CV score: 3.279266e+13 
Adaptive bandwidth: 47 CV score: 8.404789e+12 
Adaptive bandwidth: 44 CV score: 8.271245e+12 
Adaptive bandwidth: 43 CV score: 8.178387e+12 
Adaptive bandwidth: 42 CV score: 3.279266e+13 
Adaptive bandwidth: 43 CV score: 8.178387e+12 

We will also save it for the future

write_rds(bw_adaptive, "data/model/bw_adaptive.rds")
bw_adaptive <- read_rds("data/model/bw_adaptive.rds")

Make the model

Now we will make the adaptive GWR model

gwr_adaptive <- gwr.basic(formula = resale_price ~ floor_area_sqm +
                  storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K,
                          data=train_data_sp,
                          bw=bw_adaptive, 
                          kernel = 'gaussian', 
                          adaptive=TRUE,
                          longlat = FALSE)

Save a copy

write_rds(gwr_adaptive, "data/model/gwr_adaptive.rds")

Reading the model

gwr_adaptive <- read_rds("data/model/gwr_adaptive.rds")
gwr_adaptive
   ***********************************************************************
   *                       Package   GWmodel                             *
   ***********************************************************************
   Program starts at: 2024-11-09 01:25:02.931115 
   Call:
   gwr.basic(formula = resale_price ~ floor_area_sqm + storey_range_bin + 
    remaining_lease_yr + PROX_CBD + PROX_ELDER + PROX_HAWKER + 
    PROX_MRT + PROX_PARK + PROX_MALL + PROX_SPMK + KIND_350 + 
    CHILD_350 + BUS_350 + PRI_1K, data = train_data_sp, bw = bw_adaptive, 
    kernel = "gaussian", adaptive = TRUE, longlat = FALSE)

   Dependent (y) variable:  resale_price
   Independent variables:  floor_area_sqm storey_range_bin remaining_lease_yr PROX_CBD PROX_ELDER PROX_HAWKER PROX_MRT PROX_PARK PROX_MALL PROX_SPMK KIND_350 CHILD_350 BUS_350 PRI_1K
   Number of data points: 1491
   ***********************************************************************
   *                    Results of Global Regression                     *
   ***********************************************************************

   Call:
    lm(formula = formula, data = data)

   Residuals:
    Min      1Q  Median      3Q     Max 
-316399  -82169  -14910   77393  544489 

   Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
   (Intercept)         9.668e+05  7.836e+04  12.338  < 2e-16 ***
   floor_area_sqm      3.203e+03  3.474e+02   9.221  < 2e-16 ***
   storey_range_bin    6.405e+03  7.529e+02   8.507  < 2e-16 ***
   remaining_lease_yr -7.654e+03  5.901e+02 -12.971  < 2e-16 ***
   PROX_CBD           -1.340e+01  1.079e+00 -12.415  < 2e-16 ***
   PROX_ELDER         -9.235e+00  5.237e+00  -1.763 0.078045 .  
   PROX_HAWKER        -1.292e+01  6.943e+00  -1.861 0.063001 .  
   PROX_MRT           -4.270e+01  8.089e+00  -5.279 1.49e-07 ***
   PROX_PARK          -5.888e-01  8.002e+00  -0.074 0.941358    
   PROX_MALL          -6.623e-01  2.184e+00  -0.303 0.761726    
   PROX_SPMK          -7.921e+01  2.392e+01  -3.311 0.000951 ***
   KIND_350            2.699e+03  2.976e+03   0.907 0.364523    
   CHILD_350           1.822e+03  1.366e+03   1.334 0.182384    
   BUS_350             2.789e+03  1.078e+03   2.587 0.009769 ** 
   PRI_1K             -2.203e+03  2.260e+03  -0.974 0.329978    

   ---Significance stars
   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
   Residual standard error: 111800 on 1476 degrees of freedom
   Multiple R-squared: 0.4256
   Adjusted R-squared: 0.4201 
   F-statistic:  78.1 on 14 and 1476 DF,  p-value: < 2.2e-16 
   ***Extra Diagnostic information
   Residual sum of squares: 1.843678e+13
   Sigma(hat): 111274.4
   AIC:  38911.38
   AICc:  38911.75
   BIC:  37622.21
   ***********************************************************************
   *          Results of Geographically Weighted Regression              *
   ***********************************************************************

   *********************Model calibration information*********************
   Kernel function: gaussian 
   Adaptive bandwidth: 43 (number of nearest neighbours)
   Regression points: the same locations as observations are used.
   Distance metric: Euclidean distance metric is used.

   ****************Summary of GWR coefficient estimates:******************
                             Min.     1st Qu.      Median     3rd Qu.
   Intercept          -1.8813e+08  8.2799e+05  2.4401e+06  4.3444e+06
   floor_area_sqm     -1.0108e+04 -2.5601e+03  9.4290e+02  4.2051e+03
   storey_range_bin    2.6740e+03  4.8119e+03  6.3826e+03  7.0950e+03
   remaining_lease_yr -4.5995e+04 -3.5888e+04 -3.3036e+04 -2.4080e+04
   PROX_CBD           -2.8386e+03 -1.2250e+02  9.8875e+00  1.4399e+02
   PROX_ELDER         -1.8907e+04 -1.1301e+02  2.6096e+01  1.3935e+02
   PROX_HAWKER        -5.8586e+03 -1.8703e+02 -2.8870e+01  6.9101e+01
   PROX_MRT           -1.4502e+04 -1.5169e+02 -5.3055e+01  1.5931e+02
   PROX_PARK          -9.5308e+03 -9.8861e+01  1.0846e+00  1.4024e+02
   PROX_MALL          -7.5326e+03 -1.1497e+02 -1.7117e+01  7.8830e+01
   PROX_SPMK          -1.8015e+04 -2.5202e+02 -8.8783e+01  1.7484e+02
   KIND_350           -1.3128e+06 -2.5058e+04  1.0755e+03  4.2140e+04
   CHILD_350          -1.4234e+05 -9.8631e+03  7.4604e+03  2.4447e+04
   BUS_350            -7.0984e+04 -6.8546e+03  2.4158e+03  1.6720e+04
   PRI_1K             -1.3838e+05 -2.4998e+04 -7.9452e+03  2.3860e+04
                            Max.
   Intercept          43850734.1
   floor_area_sqm        37760.0
   storey_range_bin      10173.2
   remaining_lease_yr    -1952.8
   PROX_CBD              14129.3
   PROX_ELDER            11215.0
   PROX_HAWKER            2564.5
   PROX_MRT               6306.6
   PROX_PARK             36947.0
   PROX_MALL             10316.5
   PROX_SPMK               934.0
   KIND_350            3694633.1
   CHILD_350            562584.2
   BUS_350              586842.9
   PRI_1K               751072.7
   ************************Diagnostic information*************************
   Number of data points: 1491 
   Effective number of parameters (2trace(S) - trace(S'S)): 228.8807 
   Effective degrees of freedom (n-2trace(S) + trace(S'S)): 1262.119 
   AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 37639.55 
   AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 37379.37 
   BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 37131.58 
   Residual sum of squares: 5.90681e+12 
   R-square value:  0.8159587 
   Adjusted R-square value:  0.782557 

   ***********************************************************************
   Program stops at: 2024-11-09 01:25:04.850959 

From the results we can see that proximity to eldercare, hawker centers, park, mall and the numbers of kindergartens and childcares within 350m and primary schools within 1km all have a p-value more than 5%.

Note : This is expected since these are higher value properties that are usually owned by affluent individuals who probably have no dependents, have barely any free time to relax and probably have their own mode of transport, hence the lack of statistical significance for aforementioned variables with exception to MRT stations which has been linked to property value. Cannot acertain this statistic since both income and family data for individuals living in these apartments is not available to us.

Computer Test Data Adaptive Bandwidth

test_data <- testing(resale_split)

We also need to check for overlap in the test data

overlap_matrix <- st_equals(test_data$geometry)
overlap_matrix <- sapply(overlap_matrix, function(x) length(x) > 1)
any_overlap <- any(overlap_matrix)
any_overlap
[1] FALSE
test_data_sp <- as_Spatial(test_data)
gwr_bw_test_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
                  storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K,
                  data=test_data_sp,
                  approach="CV",
                  kernel="gaussian",
                  adaptive=TRUE,
                  longlat=FALSE)
Adaptive bandwidth: 503 CV score: 1.093626e+13 
Adaptive bandwidth: 319 CV score: 1.033642e+13 
Adaptive bandwidth: 203 CV score: 9.551174e+12 
Adaptive bandwidth: 134 CV score: 8.957821e+12 
Adaptive bandwidth: 88 CV score: 8.089356e+12 
Adaptive bandwidth: 63 CV score: 7.306364e+12 
Adaptive bandwidth: 44 CV score: 6.542905e+12 
Adaptive bandwidth: 36 CV score: 6.016534e+12 
Adaptive bandwidth: 27 CV score: 5.756867e+12 
Adaptive bandwidth: 25 CV score: 5.814039e+12 
Adaptive bandwidth: 31 CV score: 5.994204e+12 
Adaptive bandwidth: 27 CV score: 5.756867e+12 

Now we can run prediction on the test dataset

Running Predictions on the test data

st_crs(train_data_sp)
Coordinate Reference System:
  User input: SVY21 / Singapore TM 
  wkt:
PROJCRS["SVY21 / Singapore TM",
    BASEGEOGCRS["SVY21",
        DATUM["SVY21",
            ELLIPSOID["WGS 84",6378137,298.257223563,
                LENGTHUNIT["metre",1]]],
        PRIMEM["Greenwich",0,
            ANGLEUNIT["degree",0.0174532925199433]],
        ID["EPSG",4757]],
    CONVERSION["Singapore Transverse Mercator",
        METHOD["Transverse Mercator",
            ID["EPSG",9807]],
        PARAMETER["Latitude of natural origin",1.36666666666667,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8801]],
        PARAMETER["Longitude of natural origin",103.833333333333,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8802]],
        PARAMETER["Scale factor at natural origin",1,
            SCALEUNIT["unity",1],
            ID["EPSG",8805]],
        PARAMETER["False easting",28001.642,
            LENGTHUNIT["metre",1],
            ID["EPSG",8806]],
        PARAMETER["False northing",38744.572,
            LENGTHUNIT["metre",1],
            ID["EPSG",8807]]],
    CS[Cartesian,2],
        AXIS["northing (N)",north,
            ORDER[1],
            LENGTHUNIT["metre",1]],
        AXIS["easting (E)",east,
            ORDER[2],
            LENGTHUNIT["metre",1]],
    USAGE[
        SCOPE["Cadastre, engineering survey, topographic mapping."],
        AREA["Singapore - onshore and offshore."],
        BBOX[1.13,103.59,1.47,104.07]],
    ID["EPSG",3414]]
gwr_pred <- gwr.predict(formula = resale_price ~ floor_area_sqm +
                  storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K, 
                  data= train_data_sp, 
                  predictdata = test_data_sp, 
                  bw=bw_adaptive, 
                  kernel = 'gaussian', 
                  adaptive= TRUE, 
                  longlat = FALSE)

Finding the Errors and Plotting Residuals

RMSE

Now we want to see how effective is our model by computing the residuals

Gwr_pred_df <- as.data.frame(gwr_pred$SDF)

test_data_gwr_pred <- cbind(test_data,
                        Gwr_pred_df)

First we can compare the RMSE.

rmse(test_data_gwr_pred$resale_price, 
     test_data_gwr_pred$prediction)
[1] 73786.94
ggplot(data = test_data_gwr_pred,
       aes(x = prediction,
           y = resale_price)) +
  geom_point()

Plot Residuals

We can also plot this out on the map to see the residuals by location

test_data_gwr_pred$residuals <- test_data_gwr_pred$prediction - test_data_gwr_pred$resale_price
st_crs(test_data_gwr_pred)
Coordinate Reference System:
  User input: EPSG:3414 
  wkt:
PROJCRS["SVY21 / Singapore TM",
    BASEGEOGCRS["SVY21",
        DATUM["SVY21",
            ELLIPSOID["WGS 84",6378137,298.257223563,
                LENGTHUNIT["metre",1]]],
        PRIMEM["Greenwich",0,
            ANGLEUNIT["degree",0.0174532925199433]],
        ID["EPSG",4757]],
    CONVERSION["Singapore Transverse Mercator",
        METHOD["Transverse Mercator",
            ID["EPSG",9807]],
        PARAMETER["Latitude of natural origin",1.36666666666667,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8801]],
        PARAMETER["Longitude of natural origin",103.833333333333,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8802]],
        PARAMETER["Scale factor at natural origin",1,
            SCALEUNIT["unity",1],
            ID["EPSG",8805]],
        PARAMETER["False easting",28001.642,
            LENGTHUNIT["metre",1],
            ID["EPSG",8806]],
        PARAMETER["False northing",38744.572,
            LENGTHUNIT["metre",1],
            ID["EPSG",8807]]],
    CS[Cartesian,2],
        AXIS["northing (N)",north,
            ORDER[1],
            LENGTHUNIT["metre",1]],
        AXIS["easting (E)",east,
            ORDER[2],
            LENGTHUNIT["metre",1]],
    USAGE[
        SCOPE["Cadastre, engineering survey, topographic mapping."],
        AREA["Singapore - onshore and offshore."],
        BBOX[1.13,103.59,1.47,104.07]],
    ID["EPSG",3414]]
mpsz = st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>% st_transform(3414)
Reading layer `MP14_SUBZONE_WEB_PL' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
tmap_mode("view")
tmap mode set to interactive viewing
tm_shape(mpsz) +
    tmap_options(check.and.fix = TRUE) +
    tm_polygons(alpha = 0.4) +
tm_shape(test_data_gwr_pred) +
    tm_dots(col = "residuals",
            alpha = 0.6,
            style = "quantile")
Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
Variable(s) "residuals" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("plot")
tmap mode set to plotting

Part 7 : Generating Random Forest Model

Now we can move on to creating the random forest model

coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)
train_data_nogeom <- train_data %>%
  st_drop_geometry()

After preparing the data, we can train the model below

Basic Random Forest

set.seed(1234)
rf <- ranger(resale_price ~ floor_area_sqm +
                  storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K,
             data=train_data_nogeom)

We can view the model output below

rf
Ranger result

Call:
 ranger(resale_price ~ floor_area_sqm + storey_range_bin + remaining_lease_yr +      PROX_CBD + PROX_ELDER + PROX_HAWKER + PROX_MRT + PROX_PARK +      PROX_MALL + PROX_SPMK + KIND_350 + CHILD_350 + BUS_350 +      PRI_1K, data = train_data_nogeom) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      1491 
Number of independent variables:  14 
Mtry:                             3 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       6089651561 
R squared (OOB):                  0.7172901 

Cleaning up the data

The code now is taking up alot of space, so we nedd to clean up some of the data that we dont need for future

rm(rf)
rm(price_mlr)
rm(resale_split)
rm(gwr_adaptive)
rm(busstop_data)
rm(childcare_data)
rm(cleaned_resale_no_geom)
rm(cleaned_resale_sf)
rm(eldercare_data)
rm(foodcourt_data)
rm(kindergarten_data)
rm(mall_data)
rm(MRT_data)
rm(primarySchool_data)

Geographically weighted Random Forest

set.seed(1234)
gwRF_adaptive <- grf(formula = resale_price ~ floor_area_sqm +
                  storey_range_bin + remaining_lease_yr +
                  PROX_CBD + PROX_ELDER + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SPMK + KIND_350 +
                  CHILD_350 + BUS_350 +
                  PRI_1K,
                     dframe=train_data_nogeom, 
                     bw=bw_adaptive,
                     kernel="adaptive",
                     coords=coords_train)

Number of Observations: 1491
Number of Independent Variables: 14
Kernel: Adaptive
Neightbours: 43

--------------- Global ML Model Summary ---------------
Ranger result

Call:
 ranger(resale_price ~ floor_area_sqm + storey_range_bin + remaining_lease_yr +      PROX_CBD + PROX_ELDER + PROX_HAWKER + PROX_MRT + PROX_PARK +      PROX_MALL + PROX_SPMK + KIND_350 + CHILD_350 + BUS_350 +      PRI_1K, data = train_data_nogeom, num.trees = 500, mtry = 4,      importance = "impurity", num.threads = NULL) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      1491 
Number of independent variables:  14 
Mtry:                             4 
Target node size:                 5 
Variable importance mode:         impurity 
Splitrule:                        variance 
OOB prediction error (MSE):       5602475402 
R squared (OOB):                  0.739907 

Importance:
    floor_area_sqm   storey_range_bin remaining_lease_yr           PROX_CBD 
      2.794002e+12       1.064561e+12       8.613074e+12       4.801998e+12 
        PROX_ELDER        PROX_HAWKER           PROX_MRT          PROX_PARK 
      2.290748e+12       2.063397e+12       1.706451e+12       2.164520e+12 
         PROX_MALL          PROX_SPMK           KIND_350          CHILD_350 
      1.757189e+12       1.498189e+12       3.720786e+11       5.911166e+11 
           BUS_350             PRI_1K 
      9.500828e+11       4.057296e+11 

Mean Square Error (Not OOB): 1134850527.663
R-squared (Not OOB) %: 94.728
AIC (Not OOB): 31117.002
AICc (Not OOB): 31117.328

--------------- Local Model Summary ---------------

Residuals OOB:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-395250  -55818   -8942   -1296   48881  385720 

Residuals Predicted (Not OOB):
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-85418.6 -10008.2  -1099.2   -159.9   7359.0 103193.2 

Local Variable Importance:
                           Min          Max         Mean         StD
floor_area_sqm               0 230479448522  19827073998 28072159374
storey_range_bin    4161588141 170612160215  25305470885 19726055203
remaining_lease_yr 14922847678 522468180484 166905563447 76306617207
PROX_CBD            7736481806 141347268042  36790838993 20893908093
PROX_ELDER          4306763382 215318703281  33543153214 18430940880
PROX_HAWKER         9897029472 274210235457  38525937031 29979574797
PROX_MRT            5904170333 358141242887  35446779701 28977636610
PROX_PARK           7064569407 203902117509  37649035408 22070489991
PROX_MALL           5513830424 268378628166  35574765368 26183009672
PROX_SPMK           5896338657 218795483945  37521090221 21768627551
KIND_350                     0  84598504644   2525243357  4566500444
CHILD_350                    0 109063784222   3712928233  5450132761
BUS_350                      0 180806013156   4856356177  7854727964
PRI_1K                       0 110590608361   3640699897  7950831240

Mean squared error (OOB): 7363687438.936
R-squared (OOB) %: 65.791
AIC (OOB): 33905.262
AICc (OOB): 33905.587
Mean squared error Predicted (Not OOB): 324623255.383
R-squared Predicted (Not OOB) %: 98.492
AIC Predicted (Not OOB): 29250.88
AICc Predicted (Not OOB): 29251.206

Calculation time (in seconds): 4.2504

We can then save the model for future use

write_rds(gwRF_adaptive, "data/model/gwRF_adaptive.rds")

We then re-read it, mostly for running purposes

gwRF_adaptive <- read_rds("data/model/gwRF_adaptive.rds")

Predicting with test data

test_data_nogeom <- cbind(
  test_data, coords_test) %>%
  st_drop_geometry()
gwRF_pred <- predict.grf(gwRF_adaptive, 
                           test_data_nogeom, 
                           x.var.name="X",
                           y.var.name="Y", 
                           local.w=1,
                           global.w=0)
GRF_pred_df <- as.data.frame(gwRF_pred)

test_data_pred <- cbind(test_data,
                        GRF_pred_df)

Save the data for the future

write_rds(test_data_pred, "data/test_results.rds")
test_data_pred <- read_rds( "data/test_results.rds")

Freeing Memory

The model is very large >15Gb so once we got th results, we should free up the memory

rm(gwRF_adaptive)

Viewing Random Forest Prediction Error

Now we can compare the difference in values from predictions vs actual resale value by location

rmse(test_data_pred$resale_price, 
     test_data_pred$gwRF_pred)
[1] 83570.12
ggplot(data = test_data_pred,
       aes(x = gwRF_pred,
           y = resale_price)) +
  geom_point()

Show residuals

test_data_pred$residuals <- test_data_pred$gwRF_pred - test_data_pred$resale_price
st_crs(test_data_pred)
Coordinate Reference System:
  User input: EPSG:3414 
  wkt:
PROJCRS["SVY21 / Singapore TM",
    BASEGEOGCRS["SVY21",
        DATUM["SVY21",
            ELLIPSOID["WGS 84",6378137,298.257223563,
                LENGTHUNIT["metre",1]]],
        PRIMEM["Greenwich",0,
            ANGLEUNIT["degree",0.0174532925199433]],
        ID["EPSG",4757]],
    CONVERSION["Singapore Transverse Mercator",
        METHOD["Transverse Mercator",
            ID["EPSG",9807]],
        PARAMETER["Latitude of natural origin",1.36666666666667,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8801]],
        PARAMETER["Longitude of natural origin",103.833333333333,
            ANGLEUNIT["degree",0.0174532925199433],
            ID["EPSG",8802]],
        PARAMETER["Scale factor at natural origin",1,
            SCALEUNIT["unity",1],
            ID["EPSG",8805]],
        PARAMETER["False easting",28001.642,
            LENGTHUNIT["metre",1],
            ID["EPSG",8806]],
        PARAMETER["False northing",38744.572,
            LENGTHUNIT["metre",1],
            ID["EPSG",8807]]],
    CS[Cartesian,2],
        AXIS["northing (N)",north,
            ORDER[1],
            LENGTHUNIT["metre",1]],
        AXIS["easting (E)",east,
            ORDER[2],
            LENGTHUNIT["metre",1]],
    USAGE[
        SCOPE["Cadastre, engineering survey, topographic mapping."],
        AREA["Singapore - onshore and offshore."],
        BBOX[1.13,103.59,1.47,104.07]],
    ID["EPSG",3414]]
# Load in the mpsz data
mpsz = st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>% st_transform(3414)
Reading layer `MP14_SUBZONE_WEB_PL' from data source 
  `C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\TakeHomeEx\TakeHomeEx03\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21

Plot the map

tmap_mode("view")
tmap mode set to interactive viewing
tm_shape(mpsz) +
    tmap_options(check.and.fix = TRUE) +
    tm_polygons(alpha = 0.4) +
tm_shape(test_data_pred) +
    tm_dots(col = "residuals",
            alpha = 0.6,
            style = "quantile")
Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
Variable(s) "residuals" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("plot")
tmap mode set to plotting